Preparing the data

Load the required packages:

library(tidyverse)
library(glue)
library(scales)
library(Matrix)
library(scater)
library(scran)
library(pheatmap)
library(viridis)

Global parameters:

csv_dir <- "simulated_ovarian_bulk_sc_benchmarks_hvts_data"
dir.create(csv_dir, recursive = TRUE)
result_dir <- file.path("..", "simulated_ovarian_analysis", "report_data")
sample_ids <- c("SK-OV-3", "IGROV-1", "OVMANA", "OVKATE",
                "OVTOKO", "COV362")
top_n_hvts <- 6000
min_cor_spearman <- 0
max_cor_spearman <- 1.0
min_mean_rel_diff <- 0.2
max_mean_rel_diff <- 1.4
heatmap_palette <- cividis
heatmap_palette_direction <- 1
scatterplot_df_list <- list()

Helper functions:

fill_missing_matrix <- function(x, all_rownames) {
    missing_rownames <- setdiff(all_rownames, rownames(x))
    missing_matrix <- matrix(
        0, nrow = length(missing_rownames), ncol = ncol(x)
    )
    rownames(missing_matrix) <- missing_rownames
    full_matrix <- rbind(x, missing_matrix)
    full_matrix <- full_matrix[all_rownames,]
    return(full_matrix)
}
get_hvts <- function(count_matrix) {
    sce <- SingleCellExperiment(assays = list(counts = count_matrix))
    sce <- computeLibraryFactors(sce)
    sce <- logNormCounts(sce)
    dec <- modelGeneVar(sce)
    top_hvts <- getTopHVGs(dec, n = top_n_hvts)
    return(top_hvts)
}
get_mean_rel_diff <- function(x, y) {
    selector <- (x != 0) & (y != 0)
    x <- x[selector]
    y <- y[selector]
    mean(abs(x - y) / ((x + y) / 2))
}
get_mean_rel_diff_matrix <- function(x, y) {
    apply(y, 2, function(y_values) {
        apply(x, 2, function(x_values) {
            selector <- (y_values != 0) & (x_values != 0)
            y_values <- y_values[selector]
            x_values <- x_values[selector]
            mean(abs(x_values - y_values) / ((x_values + y_values) / 2))
        })
    })    
}
max_no_diag <- function(x, idx) {
    x[idx] <- NA
    max(x, na.rm = TRUE)
}
min_no_diag <- function(x, idx) {
    x[idx] <- NA
    min(x, na.rm = TRUE)
}
get_cor_diff <- function(cor_matrix) {
    output <- sapply(seq(nrow(cor_matrix)), function(idx) {
        x <- cor_matrix[idx,]
        x[seq(nrow(cor_matrix))] - max_no_diag(x, idx)
    })
    colnames(output) <- rownames(cor_matrix)
    return(output)
}
get_rel_diff_diff <- function(rel_diff_matrix) {
    output <- sapply(seq(nrow(rel_diff_matrix)), function(idx) {
        x <- rel_diff_matrix[idx,]
        min_no_diag(x, idx) - x[seq(nrow(rel_diff_matrix))]
    })
    colnames(output) <- rownames(rel_diff_matrix)
    return(output)
}
prepare_scatterplot_data <- function(tpm_matrix_bulk,
                                     tpm_matrix_pseudobulk) {
    tpm_bulk_df <- as.data.frame(tpm_matrix_bulk)
    bulk_levels <- colnames(tpm_bulk_df)
    tpm_bulk_df$transcript_id <- rownames(tpm_bulk_df)
    rownames(tpm_bulk_df) <- NULL
    tpm_bulk_df <- tpm_bulk_df %>%
        gather(-transcript_id, key = "sample_bulk", value = "tpm_bulk")
    tpm_bulk_df$sample_bulk <- factor(
        tpm_bulk_df$sample_bulk, levels = bulk_levels
    )
    tpm_pseudobulk_df <- as.data.frame(tpm_matrix_pseudobulk)
    pseudobulk_levels <- colnames(tpm_pseudobulk_df)
    tpm_pseudobulk_df$transcript_id <- rownames(tpm_pseudobulk_df)
    rownames(tpm_pseudobulk_df) <- NULL
    tpm_pseudobulk_df <- tpm_pseudobulk_df %>%
        gather(-transcript_id, key = "sample_pseudobulk",
               value = "tpm_pseudobulk")
    tpm_pseudobulk_df$sample_pseudobulk <- factor(
        tpm_pseudobulk_df$sample_pseudobulk, levels = pseudobulk_levels
    )
    tpm_df <- left_join(tpm_bulk_df, tpm_pseudobulk_df)
    tpm_df$status <- ifelse(
        as.numeric(tpm_df$sample_bulk) ==
            as.numeric(tpm_df$sample_pseudobulk),
        "Correct vs Correct", "Correct vs Decoy"
    )
    tpm_df$status <- factor(
        tpm_df$status,
        levels = c("Correct vs Correct", "Correct vs Decoy")
    )
    return(tpm_df)
}
create_scatterplot_paired <- function(plot_df) {
    plot_df$tpm_bulk[plot_df$tpm_bulk < 0.1] <- 0.1
    plot_df$tpm_pseudobulk[plot_df$tpm_pseudobulk < 0.1] <- 0.1
    scatter_plot <- ggplot(plot_df, mapping = aes(x = tpm_bulk,
                                                  y = tpm_pseudobulk)) +
        geom_point(alpha = 0.35, size = 0.05) +
        geom_abline(intercept = 0, slope = 1,
                    lty = 3, color = "red") +
        facet_grid(rows = vars(sample_pseudobulk),
                   cols = vars(sample_bulk)) +
        scale_x_log10(labels = trans_format("log10", math_format(10^.x))) +
        scale_y_log10(labels = trans_format("log10", math_format(10^.x))) +
        coord_cartesian(xlim = c(0.1, 10^5),
                        ylim = c(0.1, 10^5),
                        clip = "off") +
        annotation_logticks(outside = TRUE,
                            size = 0.25,
                            short = unit(0.05, "cm"),
                            mid = unit(0.1, "cm"),
                            long = unit(0.15, "cm")) +
        theme_bw() +
        theme(aspect.ratio = 1,
              axis.title.x = element_blank(),
              axis.title.y = element_blank())
    return(scatter_plot)
}
create_scatterplot_combined <- function(plot_df) {
    plot_df$tpm_bulk[plot_df$tpm_bulk < 0.1] <- 0.1
    plot_df$tpm_pseudobulk[plot_df$tpm_pseudobulk < 0.1] <- 0.1
    scatter_plot <- ggplot(plot_df, mapping = aes(x = tpm_bulk,
                                                  y = tpm_pseudobulk)) +
        geom_point(alpha = 0.35, size = 0.05) +
        geom_abline(intercept = 0, slope = 1,
                    lty = 3, color = "red") +
        facet_grid(cols = vars(status)) +
        scale_x_log10(labels = trans_format("log10", math_format(10^.x))) +
        scale_y_log10(labels = trans_format("log10", math_format(10^.x))) +
        coord_cartesian(xlim = c(0.1, 10^5),
                        ylim = c(0.1, 10^5),
                        clip = "off") +
        annotation_logticks(outside = TRUE,
                            size = 0.25,
                            short = unit(0.05, "cm"),
                            mid = unit(0.1, "cm"),
                            long = unit(0.15, "cm")) +
        theme_bw() +
        theme(aspect.ratio = 1,
              axis.title.x = element_blank(),
              axis.title.y = element_blank())
    return(scatter_plot)
}

Isosceles bulk-pseudobulk correlation plots

Prepare Isosceles scRNA-Seq data:

isosceles_sc_list <- lapply(sample_ids, function(sample_id) {
    se_transcript <- readRDS(file.path(
        result_dir, glue("isosceles_{sample_id}_Rep1_se_transcript.rds")
    ))
})
sc_counts <- do.call(
    cbind, lapply(isosceles_sc_list, function(se) {
        assay(se, "counts")
    })
)

Prepare Isosceles pseudobulk RNA-Seq data:

isosceles_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
    se_pseudobulk <- readRDS(file.path(
        result_dir, glue("isosceles_{sample_id}_Rep1_se_pseudobulk_transcript.rds")
    ))
})
tpm_pseudobulk <- do.call(
    cbind, lapply(isosceles_pseudobulk_list, function(se) {
        assay(se, "tpm")
    })
)
colnames(tpm_pseudobulk) <- sample_ids

Prepare Isosceles bulk RNA-Seq data:

isosceles_bulk_list <- lapply(sample_ids, function(sample_id) {
    se_transcript <- readRDS(file.path(
        result_dir, glue("isosceles_{sample_id}_Rep2_se_transcript.rds")
    ))
})
tpm_bulk <- do.call(
    cbind, lapply(isosceles_bulk_list, function(se) {
        assay(se, "tpm")
    })
)
colnames(tpm_bulk) <- sample_ids

Select top transcripts for further analysis:

top_transcripts <- get_hvts(sc_counts)

Re-normalize the TPM values:

tpm_bulk <- tpm_bulk[top_transcripts,]
tpm_pseudobulk <- tpm_pseudobulk[top_transcripts,]
tpm_bulk <- t(
    t(tpm_bulk) / colSums(tpm_bulk) * 1e6
)
tpm_pseudobulk <- t(
    t(tpm_pseudobulk) / colSums(tpm_pseudobulk) * 1e6
)

Calculate the correlation matrices:

cor_spearman <- cor(tpm_pseudobulk[top_transcripts,],
                    tpm_bulk[top_transcripts,],
                    method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
    tpm_pseudobulk[top_transcripts,],
    tpm_bulk[top_transcripts,]
)

Spearman correlation matrix:

as.data.frame(cor_spearman)

Spearman correlation heatmap:

pheatmap(
    cor_spearman,
    color = heatmap_palette(100, direction = heatmap_palette_direction),
    breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

Mean relative difference matrix:

as.data.frame(mean_rel_diff)

Mean relative difference heatmap:

pheatmap(
    mean_rel_diff,
    color = heatmap_palette(100, direction = -1),
    breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

TPM scatter plots:

scatterplot_df <- prepare_scatterplot_data(tpm_bulk,
                                           tpm_pseudobulk)
scatterplot_df$tool <- "Isosceles"
scatterplot_df_list[["Isosceles"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)

create_scatterplot_combined(scatterplot_df)

IsoQuant bulk-pseudobulk correlation plots

Prepare IsoQuant scRNA-Seq data:

isoquant_sc_list <- lapply(sample_ids, function(sample_id) {
    isoquant_df <- read_delim(file.path(
        result_dir, glue("isoquant_{sample_id}_Rep1_transcript_grouped_counts.tsv")
    ))
    isoquant_transcript_ids <- isoquant_df[, 1, drop = TRUE]
    isoquant_counts <- as.matrix(isoquant_df[, c(-1)])
    rownames(isoquant_counts) <- isoquant_transcript_ids
    colnames(isoquant_counts) <- paste0(
        sample_id, ".", colnames(isoquant_counts)
    )
    return(isoquant_counts)
})
isoquant_sc_tx_ids <- unique(unlist(sapply(
    isoquant_sc_list, rownames
)))
isoquant_sc_counts <- do.call(
    cbind, lapply(isoquant_sc_list, function(df) {
        fill_missing_matrix(df, isoquant_sc_tx_ids)
    })
)
isoquant_sc_counts <- as(isoquant_sc_counts, "dgCMatrix")

Prepare IsoQuant pseudobulk RNA-Seq data:

isoquant_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
    isoquant_df <- read_delim(file.path(
        result_dir, glue("isoquant_{sample_id}_Rep1_transcript_grouped_counts.tsv")
    ))
    isoquant_transcript_ids <- isoquant_df[, 1, drop = TRUE]
    isoquant_counts <- as(as.matrix(isoquant_df[, c(-1)]), "dgCMatrix")
    rownames(isoquant_counts) <- isoquant_transcript_ids
    isoquant_pseudobulk_counts <- rowSums(isoquant_counts)
    isoquant_pseudobulk_tpm <- isoquant_pseudobulk_counts /
        sum(isoquant_pseudobulk_counts) * 1e6
    return(isoquant_pseudobulk_tpm)
})
isoquant_pseudobulk_tx_ids <- unique(unlist(sapply(
    isoquant_pseudobulk_list, names
)))
isoquant_pseudobulk_tpm <- sapply(isoquant_pseudobulk_list,
                                  function(tpm_values) {
    tpm_values <- tpm_values[isoquant_pseudobulk_tx_ids]
    tpm_values[is.na(tpm_values)] <- 0
    return(tpm_values)
})
rownames(isoquant_pseudobulk_tpm) <- isoquant_pseudobulk_tx_ids
colnames(isoquant_pseudobulk_tpm) <- sample_ids

Prepare IsoQuant bulk RNA-Seq data:

isoquant_bulk_list <- lapply(sample_ids, function(sample_id) {
    isoquant_df <- read_delim(file.path(
        result_dir, glue("isoquant_{sample_id}_Rep2_transcript_tpm.tsv")
    ))
})
isoquant_bulk_tx_ids <- unique(unlist(sapply(
    isoquant_bulk_list, function(df) {df[, 1, drop = TRUE]}
)))
isoquant_bulk_tpm <- sapply(isoquant_bulk_list, function(df) {
    tpm_values <- df$TPM
    names(tpm_values) <- df[, 1, drop = TRUE]
    tpm_values <- tpm_values[isoquant_bulk_tx_ids]
    tpm_values[is.na(tpm_values)] <- 0
    return(tpm_values)
})
rownames(isoquant_bulk_tpm) <- isoquant_bulk_tx_ids
colnames(isoquant_bulk_tpm) <- sample_ids
isoquant_bulk_tpm <- fill_missing_matrix(isoquant_bulk_tpm,
                                         rownames(isoquant_pseudobulk_tpm))
isoquant_bulk_tpm <- t(
    t(isoquant_bulk_tpm) / colSums(isoquant_bulk_tpm) * 1e6
)

Select top transcripts for further analysis:

top_transcripts <- get_hvts(isoquant_sc_counts)

Re-normalize the TPM values:

isoquant_bulk_tpm <- isoquant_bulk_tpm[top_transcripts,]
isoquant_pseudobulk_tpm <- isoquant_pseudobulk_tpm[top_transcripts,]
isoquant_bulk_tpm <- t(
    t(isoquant_bulk_tpm) / colSums(isoquant_bulk_tpm) * 1e6
)
isoquant_pseudobulk_tpm <- t(
    t(isoquant_pseudobulk_tpm) / colSums(isoquant_pseudobulk_tpm) * 1e6
)

Calculate the correlation matrices:

cor_spearman <- cor(isoquant_pseudobulk_tpm[top_transcripts,],
                    isoquant_bulk_tpm[top_transcripts,],
                    method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
    isoquant_pseudobulk_tpm[top_transcripts,],
    isoquant_bulk_tpm[top_transcripts,]
)

Spearman correlation matrix:

as.data.frame(cor_spearman)

Spearman correlation heatmap:

pheatmap(
    cor_spearman,
    color = heatmap_palette(100, direction = heatmap_palette_direction),
    breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

Mean relative difference matrix:

as.data.frame(mean_rel_diff)

Mean relative difference heatmap:

pheatmap(
    mean_rel_diff,
    color = heatmap_palette(100, direction = -1),
    breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

TPM scatter plots:

scatterplot_df <- prepare_scatterplot_data(isoquant_bulk_tpm,
                                           isoquant_pseudobulk_tpm)
scatterplot_df$tool <- "IsoQuant"
scatterplot_df_list[["IsoQuant"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)

create_scatterplot_combined(scatterplot_df)

FLAMES bulk-pseudobulk correlation plots

Prepare FLAMES scRNA-Seq data:

flames_sc_list <- lapply(sample_ids, function(sample_id) {
    flames_df <- read_delim(file.path(
        result_dir, glue("flames_{sample_id}_Rep1_transcript_count.csv.gz")
    ))
    flames_df <- flames_df[!grepl("_", flames_df$transcript_id),]
    flames_transcript_ids <- flames_df$transcript_id
    flames_counts <- as.matrix(flames_df[, c(-1, -2)])
    rownames(flames_counts) <- flames_transcript_ids
    colnames(flames_counts) <- paste0(
        sample_id, ".", colnames(flames_counts)
    )
    return(flames_counts)
})
flames_sc_tx_ids <- unique(unlist(sapply(
    flames_sc_list, rownames
)))
flames_sc_counts <- do.call(
    cbind, lapply(flames_sc_list, function(df) {
        fill_missing_matrix(df, flames_sc_tx_ids)
    })
)
flames_sc_counts <- as(flames_sc_counts, "dgCMatrix")

Prepare FLAMES pseudobulk RNA-Seq data:

flames_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
    flames_df <- read_delim(file.path(
        result_dir, glue("flames_{sample_id}_Rep1_transcript_count.csv.gz")
    ))
    flames_df <- flames_df[!grepl("_", flames_df$transcript_id),]
    flames_transcript_ids <- flames_df$transcript_id
    flames_counts <- as(as.matrix(flames_df[, c(-1, -2)]), "dgCMatrix")
    rownames(flames_counts) <- flames_transcript_ids
    flames_pseudobulk_counts <- rowSums(flames_counts)
    flames_pseudobulk_tpm <- flames_pseudobulk_counts /
        sum(flames_pseudobulk_counts) * 1e6
    return(flames_pseudobulk_tpm)
})
flames_pseudobulk_tx_ids <- unique(unlist(sapply(
    flames_pseudobulk_list, names
)))
flames_pseudobulk_tpm <- sapply(flames_pseudobulk_list,
                                function(tpm_values) {
    tpm_values <- tpm_values[flames_pseudobulk_tx_ids]
    tpm_values[is.na(tpm_values)] <- 0
    return(tpm_values)
})
rownames(flames_pseudobulk_tpm) <- flames_pseudobulk_tx_ids
colnames(flames_pseudobulk_tpm) <- sample_ids

Prepare FLAMES bulk RNA-Seq data:

flames_bulk_list <- lapply(sample_ids, function(sample_id) {
    flames_df <- read_csv(file.path(
        result_dir, glue("flames_{sample_id}_Rep2_transcript_count.csv.gz")
    ))
    flames_df <- flames_df[!grepl("_", flames_df$transcript_id),]
    return(flames_df)
})
flames_bulk_tx_ids <- unique(unlist(sapply(
    flames_bulk_list, function(df) {df$transcript_id}
)))
flames_bulk_tpm <- sapply(flames_bulk_list, function(df) {
    tpm_values <- df[, 3, drop = TRUE]
    names(tpm_values) <- df$transcript_id
    tpm_values <- tpm_values[flames_bulk_tx_ids]
    tpm_values[is.na(tpm_values)] <- 0
    return(tpm_values)
})
rownames(flames_bulk_tpm) <- flames_bulk_tx_ids
colnames(flames_bulk_tpm) <- sample_ids
flames_bulk_tpm <- t(t(flames_bulk_tpm) / colSums(flames_bulk_tpm) * 1e6)
flames_bulk_tpm <- fill_missing_matrix(flames_bulk_tpm,
                                       rownames(flames_pseudobulk_tpm))
flames_bulk_tpm <- t(
    t(flames_bulk_tpm) / colSums(flames_bulk_tpm) * 1e6
)

Select top transcripts for further analysis:

top_transcripts <- get_hvts(flames_sc_counts)

Re-normalize the TPM values:

flames_bulk_tpm <- flames_bulk_tpm[top_transcripts,]
flames_pseudobulk_tpm <- flames_pseudobulk_tpm[top_transcripts,]
flames_bulk_tpm <- t(
    t(flames_bulk_tpm) / colSums(flames_bulk_tpm) * 1e6
)
flames_pseudobulk_tpm <- t(
    t(flames_pseudobulk_tpm) / colSums(flames_pseudobulk_tpm) * 1e6
)

Calculate the correlation matrices:

cor_spearman <- cor(flames_pseudobulk_tpm[top_transcripts,],
                    flames_bulk_tpm[top_transcripts,],
                    method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
    flames_pseudobulk_tpm[top_transcripts,],
    flames_bulk_tpm[top_transcripts,]
)

Spearman correlation matrix:

as.data.frame(cor_spearman)

Spearman correlation heatmap:

pheatmap(
    cor_spearman,
    color = heatmap_palette(100, direction = heatmap_palette_direction),
    breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

Mean relative difference matrix:

as.data.frame(mean_rel_diff)

Mean relative difference heatmap:

pheatmap(
    mean_rel_diff,
    color = heatmap_palette(100, direction = -1),
    breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

TPM scatter plots:

scatterplot_df <- prepare_scatterplot_data(flames_bulk_tpm,
                                           flames_pseudobulk_tpm)
scatterplot_df$tool <- "FLAMES"
scatterplot_df_list[["FLAMES"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)

create_scatterplot_combined(scatterplot_df)

Sicelore bulk-pseudobulk correlation plots

Prepare Sicelore scRNA-Seq data:

sicelore_sc_list <- lapply(sample_ids, function(sample_id) {
    sicelore_df <- read_delim(file.path(
        result_dir, glue("sicelore_{sample_id}_Rep1_sicelore_isomatrix.txt")
    ))
    sicelore_df <- sicelore_df[sicelore_df$transcriptId != "undef",]
    sicelore_transcript_ids <- sicelore_df$transcriptId
    sicelore_counts <- as.matrix(sicelore_df[, c(-1, -2, -3)])
    rownames(sicelore_counts) <- sicelore_transcript_ids
    colnames(sicelore_counts) <- paste0(
        sample_id, ".", colnames(sicelore_counts)
    )
    return(sicelore_counts)
})
sicelore_sc_tx_ids <- unique(unlist(sapply(
    sicelore_sc_list, rownames
)))
sicelore_sc_counts <- do.call(
    cbind, lapply(sicelore_sc_list, function(df) {
        fill_missing_matrix(df, sicelore_sc_tx_ids)
    })
)
sicelore_sc_counts <- as(sicelore_sc_counts, "dgCMatrix")

Prepare Sicelore pseudobulk RNA-Seq data:

sicelore_pseudobulk_list <- lapply(sample_ids, function(sample_id) {
    sicelore_df <- read_delim(file.path(
        result_dir, glue("sicelore_{sample_id}_Rep1_sicelore_isomatrix.txt")
    ))
    sicelore_df <- sicelore_df[sicelore_df$transcriptId != "undef",]
    sicelore_transcript_ids <- sicelore_df$transcriptId
    sicelore_counts <- as(as.matrix(sicelore_df[, c(-1, -2, -3)]),
                          "dgCMatrix")
    rownames(sicelore_counts) <- sicelore_transcript_ids
    sicelore_pseudobulk_counts <- rowSums(sicelore_counts)
    sicelore_pseudobulk_tpm <- sicelore_pseudobulk_counts /
        sum(sicelore_pseudobulk_counts) * 1e6
    return(sicelore_pseudobulk_tpm)
})
sicelore_pseudobulk_tx_ids <- unique(unlist(sapply(
    sicelore_pseudobulk_list, names
)))
sicelore_pseudobulk_tpm <- sapply(sicelore_pseudobulk_list,
                                  function(tpm_values) {
    tpm_values <- tpm_values[sicelore_pseudobulk_tx_ids]
    tpm_values[is.na(tpm_values)] <- 0
    return(tpm_values)
})
rownames(sicelore_pseudobulk_tpm) <- sicelore_pseudobulk_tx_ids
colnames(sicelore_pseudobulk_tpm) <- sample_ids

Prepare Sicelore bulk RNA-Seq data:

sicelore_bulk_list <- lapply(sample_ids, function(sample_id) {
    sicelore_df <- read_delim(file.path(
        result_dir, glue("sicelore_{sample_id}_Rep2_sicelore_isomatrix.txt")
    ))
    sicelore_df <- sicelore_df[sicelore_df$transcriptId != "undef",]
    return(sicelore_df)
})
sicelore_bulk_tx_ids <- unique(unlist(sapply(
    sicelore_bulk_list, function(df) {df$transcriptId}
)))
sicelore_bulk_tpm <- sapply(sicelore_bulk_list, function(df) {
    tpm_values <- df[, 4, drop = TRUE]
    names(tpm_values) <- df$transcriptId
    tpm_values <- tpm_values[sicelore_bulk_tx_ids]
    tpm_values[is.na(tpm_values)] <- 0
    return(tpm_values)
})
rownames(sicelore_bulk_tpm) <- sicelore_bulk_tx_ids
colnames(sicelore_bulk_tpm) <- sample_ids
sicelore_bulk_tpm <- t(
    t(sicelore_bulk_tpm) / colSums(sicelore_bulk_tpm) * 1e6
)
sicelore_bulk_tpm <- fill_missing_matrix(sicelore_bulk_tpm,
                                         rownames(sicelore_pseudobulk_tpm))
sicelore_bulk_tpm <- t(
    t(sicelore_bulk_tpm) / colSums(sicelore_bulk_tpm) * 1e6
)

Select top transcripts for further analysis:

top_transcripts <- get_hvts(sicelore_sc_counts)

Re-normalize the TPM values:

sicelore_bulk_tpm <- sicelore_bulk_tpm[top_transcripts,]
sicelore_pseudobulk_tpm <- sicelore_pseudobulk_tpm[top_transcripts,]
sicelore_bulk_tpm <- t(
    t(sicelore_bulk_tpm) / colSums(sicelore_bulk_tpm) * 1e6
)
sicelore_pseudobulk_tpm <- t(
    t(sicelore_pseudobulk_tpm) / colSums(sicelore_pseudobulk_tpm) * 1e6
)

Calculate the correlation matrices:

cor_spearman <- cor(sicelore_pseudobulk_tpm[top_transcripts,],
                    sicelore_bulk_tpm[top_transcripts,],
                    method = "spearman")
mean_rel_diff <- get_mean_rel_diff_matrix(
    sicelore_pseudobulk_tpm[top_transcripts,],
    sicelore_bulk_tpm[top_transcripts,]
)

Spearman correlation matrix:

as.data.frame(cor_spearman)

Spearman correlation heatmap:

pheatmap(
    cor_spearman,
    color = heatmap_palette(100, direction = heatmap_palette_direction),
    breaks = seq(min_cor_spearman, max_cor_spearman, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

Mean relative difference matrix:

as.data.frame(mean_rel_diff)

Mean relative difference heatmap:

pheatmap(
    mean_rel_diff,
    color = heatmap_palette(100, direction = -1),
    breaks = seq(min_mean_rel_diff, max_mean_rel_diff, length.out = 101),
    cluster_rows = FALSE, cluster_cols = FALSE
)

TPM scatter plots:

scatterplot_df <- prepare_scatterplot_data(sicelore_bulk_tpm,
                                           sicelore_pseudobulk_tpm)
scatterplot_df$tool <- "Sicelore"
scatterplot_df_list[["Sicelore"]] <- scatterplot_df
create_scatterplot_paired(scatterplot_df)

create_scatterplot_combined(scatterplot_df)

Summary barplots

Prepare summary barplot data:

plot_df <- do.call(rbind, scatterplot_df_list)
rownames(plot_df) <- NULL
plot_df$tool <- factor(plot_df$tool, levels = c("Isosceles", "IsoQuant",
                                                "FLAMES", "Sicelore"))
plot_df <- plot_df %>%
    group_by(tool, sample_bulk, sample_pseudobulk) %>%
    summarise(
        status = unique(status),
        corr_spearman = cor(tpm_bulk, tpm_pseudobulk,
                            method = "spearman"),
        mean_rel_diff = get_mean_rel_diff(tpm_bulk, tpm_pseudobulk)
    ) %>%
    ungroup()
plot_df <- plot_df %>%
    group_by(tool, status) %>%
    summarise(
        corr_spearman_avg = mean(corr_spearman),
        corr_spearman_se = sd(corr_spearman) / sqrt(n()),
        mean_rel_diff_avg = mean(mean_rel_diff),
        mean_rel_diff_se = sd(mean_rel_diff) / sqrt(n())
    ) %>%
    ungroup()

Summary barplot (Spearman correlation):

ggplot(plot_df, mapping = aes(x = tool,
                              y = corr_spearman_avg,
                              fill = status)) +
    geom_col(position = position_dodge(), color = "black", width = 0.5) +
    geom_errorbar(
        mapping = aes(ymin = corr_spearman_avg - corr_spearman_se,
                      ymax = corr_spearman_avg + corr_spearman_se),
        width = 0.1,
        color = "black",
        position = position_dodge(0.5)
    ) + 
    scale_fill_manual(values = c(`Correct vs Correct` = "grey",
                                 `Correct vs Decoy` = "darkred")) +
    coord_cartesian(ylim = c(0, 1)) +
    labs(y = "Correlation",
         fill = "") +
    theme_classic() +
    theme(legend.position = "top",
          legend.key.size = unit(0.4, "cm"),
          aspect.ratio = 1,
          axis.title.x = element_blank(),
          axis.text.x = element_text(size = 8))

Summary barplot (mean relative difference):

ggplot(plot_df, mapping = aes(x = tool,
                              y = mean_rel_diff_avg,
                              fill = status)) +
    geom_col(position = position_dodge(), color = "black", width = 0.5) +
    geom_errorbar(
        mapping = aes(ymin = mean_rel_diff_avg - mean_rel_diff_se,
                      ymax = mean_rel_diff_avg + mean_rel_diff_se),
        width = 0.1,
        color = "black",
        position = position_dodge(0.5)
    ) + 
    scale_fill_manual(values = c(`Correct vs Correct` = "grey",
                                 `Correct vs Decoy` = "darkred")) +
    coord_cartesian(ylim = c(0, max_mean_rel_diff + 0.1)) +
    labs(y = "Mean rel. diff.",
         fill = "") +
    theme_classic() +
    theme(legend.position = "top",
          legend.key.size = unit(0.4, "cm"),
          aspect.ratio = 1,
          axis.title.x = element_blank(),
          axis.text.x = element_text(size = 8))

Metric difference and CI table:

get_diff_value <- function(x) {
    return(abs(x[1] - x[2]))
}
get_diff_sd <- function(x) {
    return(sqrt(sum(x ** 2)))
}
get_ci_factor <- function(x) {
    qnorm(1 - (1 - x)/2)
}
diff_df <- plot_df %>%
    group_by(tool) %>%
    summarise(
        corr_spearman = get_diff_value(corr_spearman_avg),
        corr_spearman_upper_ci_95 = get_diff_value(corr_spearman_avg) +
            (get_diff_sd(corr_spearman_se) * get_ci_factor(0.95)),
        corr_spearman_lower_ci_95 = get_diff_value(corr_spearman_avg) -
            (get_diff_sd(corr_spearman_se) * get_ci_factor(0.95)),
        corr_spearman_upper_ci_90 = get_diff_value(corr_spearman_avg) +
            (get_diff_sd(corr_spearman_se) * get_ci_factor(0.90)),
        corr_spearman_lower_ci_90 = get_diff_value(corr_spearman_avg) -
            (get_diff_sd(corr_spearman_se) * get_ci_factor(0.90)),
        corr_spearman_upper_ci_68 = get_diff_value(corr_spearman_avg) +
            (get_diff_sd(corr_spearman_se) * get_ci_factor(0.68)),
        corr_spearman_lower_ci_68 = get_diff_value(corr_spearman_avg) -
            (get_diff_sd(corr_spearman_se) * get_ci_factor(0.68)),
        mean_rel_diff = get_diff_value(mean_rel_diff_avg),
        mean_rel_diff_upper_ci_95 = get_diff_value(mean_rel_diff_avg) +
            (get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.95)),
        mean_rel_diff_lower_ci_95 = get_diff_value(mean_rel_diff_avg) -
            (get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.95)),
        mean_rel_diff_upper_ci_90 = get_diff_value(mean_rel_diff_avg) +
            (get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.90)),
        mean_rel_diff_lower_ci_90 = get_diff_value(mean_rel_diff_avg) -
            (get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.90)),
        mean_rel_diff_upper_ci_68 = get_diff_value(mean_rel_diff_avg) +
            (get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.68)),
        mean_rel_diff_lower_ci_68 = get_diff_value(mean_rel_diff_avg) -
            (get_diff_sd(mean_rel_diff_se) * get_ci_factor(0.68))
    )
diff_df <- as.data.frame(diff_df)
rownames(diff_df) <- diff_df$tool
diff_df$tool <- NULL
diff_df <- as.data.frame(t(diff_df))
write.csv(diff_df, file.path(csv_dir, glue("data_{top_n_hvts}_hvts.csv")))
diff_df

Session Info

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C         LC_TIME=C           
##  [4] LC_COLLATE=C         LC_MONETARY=C        LC_MESSAGES=C       
##  [7] LC_PAPER=C           LC_NAME=C            LC_ADDRESS=C        
## [10] LC_TELEPHONE=C       LC_MEASUREMENT=C     LC_IDENTIFICATION=C 
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] viridis_0.6.2               viridisLite_0.4.0          
##  [3] pheatmap_1.0.12             scran_1.24.0               
##  [5] scater_1.24.0               scuttle_1.6.2              
##  [7] SingleCellExperiment_1.18.0 SummarizedExperiment_1.26.1
##  [9] Biobase_2.56.0              GenomicRanges_1.48.0       
## [11] GenomeInfoDb_1.32.2         IRanges_2.30.0             
## [13] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [15] MatrixGenerics_1.8.1        matrixStats_0.62.0         
## [17] Matrix_1.4-1                scales_1.2.0               
## [19] glue_1.6.2                  forcats_0.5.1              
## [21] stringr_1.4.0               dplyr_1.0.9                
## [23] purrr_0.3.4                 readr_2.1.2                
## [25] tidyr_1.2.0                 tibble_3.1.7               
## [27] ggplot2_3.3.6               tidyverse_1.3.2            
## 
## loaded via a namespace (and not attached):
##  [1] googledrive_2.0.0         ggbeeswarm_0.6.0         
##  [3] colorspace_2.0-3          ellipsis_0.3.2           
##  [5] bluster_1.6.0             XVector_0.36.0           
##  [7] BiocNeighbors_1.14.0      fs_1.5.2                 
##  [9] rstudioapi_0.13           farver_2.1.1             
## [11] bit64_4.0.5               ggrepel_0.9.1            
## [13] fansi_1.0.3               lubridate_1.8.0          
## [15] xml2_1.3.3                codetools_0.2-18         
## [17] sparseMatrixStats_1.8.0   cachem_1.0.6             
## [19] knitr_1.39                jsonlite_1.8.0           
## [21] broom_1.0.0               cluster_2.1.3            
## [23] dbplyr_2.2.1              compiler_4.2.0           
## [25] httr_1.4.3                dqrng_0.3.0              
## [27] backports_1.4.1           assertthat_0.2.1         
## [29] fastmap_1.1.0             limma_3.52.2             
## [31] gargle_1.2.0              cli_3.3.0                
## [33] BiocSingular_1.12.0       htmltools_0.5.3          
## [35] tools_4.2.0               rsvd_1.0.5               
## [37] igraph_1.3.4              gtable_0.3.0             
## [39] GenomeInfoDbData_1.2.8    Rcpp_1.0.9               
## [41] cellranger_1.1.0          jquerylib_0.1.4          
## [43] vctrs_0.4.1               DelayedMatrixStats_1.18.0
## [45] xfun_0.31                 beachmat_2.12.0          
## [47] rvest_1.0.2               lifecycle_1.0.1          
## [49] irlba_2.3.5               statmod_1.4.36           
## [51] googlesheets4_1.0.0       edgeR_3.38.2             
## [53] zlibbioc_1.42.0           vroom_1.5.7              
## [55] hms_1.1.1                 parallel_4.2.0           
## [57] RColorBrewer_1.1-3        yaml_2.3.5               
## [59] gridExtra_2.3             sass_0.4.2               
## [61] stringi_1.7.8             highr_0.9                
## [63] ScaledMatrix_1.4.0        BiocParallel_1.30.3      
## [65] rlang_1.0.4               pkgconfig_2.0.3          
## [67] bitops_1.0-7              archive_1.1.5            
## [69] evaluate_0.15             lattice_0.20-45          
## [71] labeling_0.4.2            bit_4.0.4                
## [73] tidyselect_1.1.2          magrittr_2.0.3           
## [75] R6_2.5.1                  generics_0.1.3           
## [77] metapod_1.4.0             DelayedArray_0.22.0      
## [79] DBI_1.1.3                 pillar_1.7.0             
## [81] haven_2.5.0               withr_2.5.0              
## [83] RCurl_1.98-1.8            modelr_0.1.8             
## [85] crayon_1.5.1              utf8_1.2.2               
## [87] tzdb_0.3.0                rmarkdown_2.14           
## [89] locfit_1.5-9.6            grid_4.2.0               
## [91] readxl_1.4.0              reprex_2.0.1             
## [93] digest_0.6.29             munsell_0.5.0            
## [95] beeswarm_0.4.0            vipor_0.4.5              
## [97] bslib_0.4.0